/* Base typography for the rendered document body. */
body{
  font-family: Helvetica;
  font-size: 16pt;
}
/* Headers: one shared size for all heading levels */
h1,h2,h3,h4,h5,h6{
  font-size: 22pt;
}

Generating the samples

# Grid-approximate the posterior for 6 waters in 9 tosses, then draw
# 10,000 samples from it (seed fixed for reproducible exercise answers).
p_grid <- seq(0, 1, length.out = 1000)
prior <- rep(1, 1000)
likelihood <- dbinom(6, size = 9, prob = p_grid)
unstd_posterior <- likelihood * prior
posterior <- unstd_posterior / sum(unstd_posterior)
set.seed(100)
samples <- sample(p_grid, size = 1e4, replace = TRUE, prob = posterior)

3E1 Proportion posterior < .2

It is always a good idea to plot a histogram of the posterior, if possible with meaningful axis limits

# Histogram of the posterior samples over the full parameter range.
hist(samples, xlim = c(0, 1))

# TRUE/FALSE coerce to 1/0, so this ratio is the proportion below 0.2
sum(samples < .2) / length(samples)
## [1] 4e-04

3E2 Proportion posterior > .8

# Proportion of posterior samples above 0.8 (logicals coerce to 0/1).
sum(samples > .8) / length(samples)
## [1] 0.1116

3E3 Proportion posterior between .2 and .8

# 3E3: fraction of samples strictly between 0.2 and 0.8
# (elementwise & builds the logical mask).
between_mask <- samples > .2 & samples < .8
mean(between_mask)
## [1] 0.888

3E4 20% of posterior below x

Here we need to use the quantile function

# Value below which 20% of the posterior mass lies.
quantile(samples, probs = .2)
##       20% 
## 0.5185185

3E5 20% of posterior above x

# Value above which 20% of the posterior mass lies (the 80% quantile).
quantile(samples, probs = .8)
##       80% 
## 0.7557558

3E6 Border values of narrowest 66% interval

This is the highest density posterior interval.

# HPDI() from the rethinking package: the narrowest interval
# containing 66% of the posterior mass.
HPDI(samples,.66)
##     |0.66     0.66| 
## 0.5085085 0.7737738

3E7 Border values of the central 66% interval

This is commonly called the credible interval.

# Central 66% interval: cut (1 - .66)/2 = 17% of the mass off each tail.
quantile(samples, probs = c((1 - .66) / 2, 1 - (1 - .66) / 2))
##       17%       83% 
## 0.5025025 0.7697698

or

# PI() from the rethinking package: the central (percentile) interval.
PI(samples,.66)
##       17%       83% 
## 0.5025025 0.7697698

3M1 Construct posterior for new data with grid approximation

Everything except the data is the same as at the beginning of the exercises

# New data: 8 waters in 15 tosses; grid and flat prior reused from above.
likelihood <- dbinom(8, size = 15, prob = p_grid)
posterior <- likelihood * prior
posterior <- posterior / sum(posterior)
plot(p_grid, posterior, type = "l")  # "l" draws a line instead of dots

3M2 10000 samples and 90% HPDI

We use the HPDI function from the rethinking package.

# Draw 10,000 samples (no new seed: continues the existing RNG stream),
# then compute the 90% highest posterior density interval.
samples <- sample(p_grid, size = 1e4, replace = TRUE, prob = posterior)
HPDI(samples, prob = .9)
##      |0.9      0.9| 
## 0.3293293 0.7167167

3M3 posterior predictive check

A few general tips

# 3M3: posterior predictive check -- simulate a new 15-toss experiment
# for every posterior sample of p.
posterior_predictions = 
  rbinom(n = length(samples),
         size = 15,
         prob = samples)

# Stack the two plots vertically.
par(mfrow = c(2,1))

# Pipe version (magrittr): tabulate counts, convert to proportions, plot.
posterior_predictions %>% 
  table() %>% 
  prop.table() %>% 
  plot(ylab = "Proportion of samples",
       xlab = "N water")

# Now the same result with hard-to-read code
plot(prop.table(table(posterior_predictions)),ylab = "Proportion of samples", xlab = "N water")

# Probability of reproducing the observed 8 waters in 15 tosses.
mean(posterior_predictions == 8)
## [1] 0.1444

The probability of getting exactly 8 out of 15 is not 1, but at about 14% it is the most likely single outcome.

3M4 Probability 6 in 9, given previous posterior

We only need to change the size, and can then look at the predictions.

# 3M4: posterior predictive for 9 tosses, using the 8/15 posterior samples.
posterior_predictions <- rbinom(n = length(samples),
                                size = 9,
                                prob = samples)

# Predicted water counts as proportions of all simulations.
prediction_props <- prop.table(table(posterior_predictions))
plot(prediction_props,
     ylab = "Proportion of samples",
     xlab = "N water")

# Probability of exactly 6 waters in 9 tosses under this posterior.
mean(posterior_predictions == 6)
## [1] 0.1751

3M5 Repeat the analyses with a step prior that is zero below p = 0.5

Maybe this is a bit advanced, but we will write a little function that does all we need for us instead of doing everything manually:

# Grid-approximate a binomial posterior and return 1e4 samples from it.
#
# p_grid:   grid of candidate proportions
# prior:    prior weight at each grid point
# n_tosses: number of binomial trials
# n_water:  number of observed successes
# Note: the seed is fixed inside, so repeated calls return identical draws.
get_samples = function(p_grid, prior, n_tosses, n_water) {
  unstd_posterior <- dbinom(n_water, size = n_tosses, prob = p_grid) * prior
  posterior <- unstd_posterior / sum(unstd_posterior)
  set.seed(100)
  sample(p_grid, size = 1e4, replace = TRUE, prob = posterior)
}

# Compute every 3E/3M summary statistic for one prior and return them as
# a named numeric vector.  Relies on HPDI() and PI() from the rethinking
# package, and on get_samples() defined above.
# NOTE(review): "p_betweeb_0.2_0.8" is a typo for "between", and the #E5
# tag belongs to exercise 3E5; both are kept so the names match the
# recorded output below.
get_my_results = function(p_grid, prior) {
  
  samples = get_samples(p_grid, prior, n_tosses = 9, n_water = 6)
  spls_15_8 = get_samples(p_grid, prior, n_tosses = 15, n_water = 8)
  
  results = c(
    p_below_0.2 = mean(samples < .2),                #E1
    p_above_0.8 = mean(samples > .8),                #E2
    p_betweeb_0.2_0.8 = mean(samples > .2 & samples < .8), #E3
    quantile_0.2 = quantile(samples,.2),             #E4
    quantile_0.8 = quantile(samples,.8),             #E5
    HDPI_66.lower = HPDI(samples,.66)[1],            #E6a
    HDPI_66.upper = HPDI(samples,.66)[2],            #E6b
    CI_66.lower = quantile(samples,(1-.66)/2),       #E7a
    CI_66.upper = quantile(samples,1-(1-.66)/2),     #E7b
    HDPI_90.lower = HPDI(spls_15_8,.90)[1],          #M2a
    HDPI_90.upper = HPDI(spls_15_8,.90)[2],          #M2b
    post_pred_8_15 = mean(rbinom(n = 10e4,size = 15,prob = spls_15_8) == 8), #M3
    post_pred_6_9 = mean(rbinom(n = 10e4,size = 9,prob = spls_15_8) == 6)   #M4
  )
  return(results)
}

# All exercise answers for the flat prior in one call.
get_my_results(p_grid = p_grid,
               prior = prior)
##         p_below_0.2         p_above_0.8   p_betweeb_0.2_0.8    quantile_0.2.20% 
##           0.0004000           0.1116000           0.8880000           0.5185185 
##    quantile_0.8.80% HDPI_66.lower.|0.66 HDPI_66.upper.0.66|     CI_66.lower.17% 
##           0.7557558           0.5085085           0.7737738           0.5025025 
##     CI_66.upper.83%  HDPI_90.lower.|0.9  HDPI_90.upper.0.9|      post_pred_8_15 
##           0.7697698           0.3343343           0.7217217           0.1478100 
##       post_pred_6_9 
##           0.1757100
# 3M5: a step prior that puts zero mass below 0.5, compared column-wise
# against the flat prior.
prior_flat <- rep(1, 1000)
prior_05 <- as.numeric(p_grid >= 0.5)  # 0 below 0.5, 1 at and above

results <- cbind(
  prior_flat = get_my_results(p_grid, prior = prior_flat),
  prior_0.5 = get_my_results(p_grid, prior = prior_05)
)

results
##                     prior_flat prior_0.5
## p_below_0.2          0.0004000 0.0000000
## p_above_0.8          0.1116000 0.1368000
## p_betweeb_0.2_0.8    0.8880000 0.8632000
## quantile_0.2.20%     0.5185185 0.5815816
## quantile_0.8.80%     0.7557558 0.7729730
## HDPI_66.lower.|0.66  0.5085085 0.5385385
## HDPI_66.upper.0.66|  0.7737738 0.7507508
## CI_66.lower.17%      0.5025025 0.5715716
## CI_66.upper.83%      0.7697698 0.7857858
## HDPI_90.lower.|0.9   0.3343343 0.5005005
## HDPI_90.upper.0.9|   0.7217217 0.7097097
## post_pred_8_15       0.1478100 0.1583700
## post_pred_6_9        0.1757100 0.2327600

To understand the differences, we can look at the entire posterior distributions.

Here are the posterior predictions for posteriors from different priors, and the predictions when using the true proportion of water.

Posterior predictions

For instance, our best guess for the probability of water with the step-shaped prior and 8 out of 15 water landings is 0.6052395. Now we can simulate what happens if we plug this parameter in and make 10 predictions:

# Simulate 10 predictions from a single point estimate: the mean of
# post_0.5 (posterior samples for the step prior -- defined in a chunk
# not shown here; presumably from get_samples. TODO confirm).
post_preds = 
  rbinom(n = 10,
       size = 15,
       prob = mean(post_0.5))
post_preds
##  [1]  7  7 10  8  9 11  9  9  6  6

What is the mean of these predictions, divided by the number of tosses?

# Average predicted count, rescaled to a proportion of the 15 tosses.
mean(post_preds)/15
## [1] 0.5466667

What if we make not 10 but 1000 predictions:

# Same point-estimate simulation, but with 1000 predictions instead of 10;
# the average proportion should now be much closer to mean(post_0.5).
post_preds = 
  rbinom(n = 1000,
       size = 15,
       prob = mean(post_0.5)) 
mean(post_preds)/15
## [1] 0.6036667

There we go. Averaged over many predictions, we get the right proportion, even if there is some variation in predictions from one parameter value, as can be seen in the next figure.

3M6 Credible Interval with width of .05

There is no unique solution for this question, because for proportions the standard error is a function of the proportion itself:

\[ se_{prop} = \sqrt{\frac{p(1-p)}{n}} \] We can illustrate this with an arbitrary sample size.

# Standard error of a sample proportion: sqrt(p * (1 - p) / n).
#
# p: the proportion (vectorized)
# n: sample size; defaults to 50, the arbitrary size used for the
#    illustration (generalized from the original hard-coded constant)
se_prop = function(p, n = 50) {
  sqrt((p * (1 - p)) / n)
}

# The SE is a function of the proportion itself, maximal at p = 0.5.
curve(se_prop(x),
      xlab = "proportion",
      ylab = "standard error")

For this exercise we assume that the target proportion is 0.7.

First we write a function that takes sample size n and proportion p as input and returns the width of the 99% CI.

# Width of the central 99% interval of the posterior for a proportion p
# observed in a sample of size n (grid approximation + sampling).
#
# p: true proportion used to fabricate the data (round(n * p) successes)
# n: sample size
# Returns the width of the 99% percentile interval (rethinking::PI).
interval_width = function(p,n) {
  p_grid = seq(0,1,length.out = 10000)
  # BUG FIX: the original used the global `prior` (length 1000), which R
  # silently recycled over this 10000-point grid; use a flat prior whose
  # length matches the grid instead.
  prior = rep(1, length(p_grid))
  likelihood = dbinom(round(n*p), size = n,  prob=p_grid)
  posterior = likelihood * prior
  posterior = posterior / sum(posterior)
  set.seed(123)
  samples = sample( p_grid , prob=posterior , size=1e5 , replace=TRUE )
  width = diff(PI(samples,prob = .99))
  return(width)
}

Next we try a coarse grid-search to find where roughly the correct sample size will be.

# Coarse scan over sample sizes to locate roughly where the 99% interval
# narrows to 0.05, assuming a true proportion of 0.7.
ns <- seq(100, 5000, 250)
i.width <- do.call(c, lapply(ns, function(n) interval_width(.7, n)))

plot(ns, i.width, type = "l",
     ylab = "interval width",
     xlab = "sample size")
abline(h = .05)  # the target width

Now that we know it is around 2000, we can search with a finer grid around this area.

# Finer search around n ~ 2000 for the smallest n with width below 0.05.
ns <- seq(1500, 3000, 25)
i.width <- do.call(c, lapply(ns, function(n) interval_width(.7, n)))

minimum_sample_size <- min(ns[i.width < .05])
minimum_sample_size

Here is a plot of the grid-search result. We also add plots for proportions of .6 and .8, just to show how this influences the width of the CI.